// vim: set syntax=asm :

// Liquid template: for each column of the zmm accumulator tile, broadcast one
// dword read through rax and combine it with every accumulator register of
// that column using the templated instruction op.
//
// Template inputs used below:
//   L, label : concatenated to form this entry label; L also prefixes the
//              non_linear_loop label jumped to at the end
//   mr       : divided by 16 to get the number of zmm registers per column
//   from, to : first and last zmm register index of the accumulator tile
//   op       : three-operand instruction mnemonic applied to each accumulator
//   flipped  : selects the order of the two source operands of op
//
// NOTE(review): assumes mr is a multiple of 16 and (to - from + 1) is a
// multiple of mr / 16, otherwise the integer divisions drop registers - confirm.
{{L}}{{label}}:
    // rax = qword stored at [rdi + 8]; it is then used as a read cursor.
    // NOTE(review): presumably a pointer to one f32 per column - confirm with caller.
    mov             rax, [ rdi + 8 ]

// Number of zmm registers stacked in one column, and that count minus one
// (Liquid ranges include their upper bound).
{% capture mr_over_16 %}{{ mr | divided_by: 16}}{%endcapture%}
{% capture mr_over_16_min_1 %}{{ mr | divided_by: 16 | minus: 1}}{%endcapture%}

// Scratch register index: the register right after the last accumulator.
{%capture tmp%}{{to | plus: 1 }}{%endcapture%}

// Column count = (to + 1 - from) / mr_over_16 (filters apply left to right).
// cols is only echoed in the generated comment below; cols_min_1 bounds the loop.
{%capture cols%}{{to | plus: 1| minus:from| divided_by:mr_over_16}}{%endcapture%}
{%capture cols_min_1%}{{to | plus: 1| minus:from| divided_by:mr_over_16|minus:1}}{%endcapture%}
// {{to|minus:from|plus:1}} cols:{{cols}}

// Per column: broadcast the dword at [rax] into the scratch register, step
// rax by 4 bytes, then apply op to accumulators
// mr_over_16 * right + from + down for down in 0..mr_over_16_min_1.
// When flipped is set the accumulator is the first source operand of op,
// otherwise the broadcast value is.
{% for right in (0..cols_min_1) %}
    vbroadcastss    zmm{{tmp}}, dword ptr [ rax ]
    add             rax, 4

    {% for down in (0..mr_over_16_min_1) %}
        {%capture acc%}{{mr_over_16|times:right|plus:from|plus:down}}{%endcapture%}
        {% if flipped %}
            {{op}} zmm{{acc}}, zmm{{acc}}, zmm{{tmp}}
        {% else %}
            {{op}} zmm{{acc}}, zmm{{tmp}}, zmm{{acc}}
        {% endif %}
    {% endfor %}
{% endfor %}

    // Continue at the non_linear_loop label, which is defined outside this template.
    jmp {{L}}non_linear_loop
